get_upper_tri = function(cormat){
cormat[lower.tri(cormat)] = NA
diag(cormat) = NA
return(cormat)
}
p_dens = function(p_mat) {
p_vec = p_mat[upper.tri(p_mat)]
df_line = data.frame(xintercept = c(0.01, 0.05),
Lines = c("p = 0.01", "p = 0.05"),
stringsAsFactors = FALSE)
fig_p = data.frame(p = p_vec) %>%
ggplot(aes(x = p)) +
stat_ecdf(geom = "point") +
geom_vline(aes(xintercept = xintercept, color = Lines), df_line,
linetype = "dashed", size = 1) +
labs(x = "P-value", y = "Cumulative density") +
scale_color_discrete(name = NULL) +
theme_bw()
return(fig_p)
}
p_filter = function(mat, mat_p, max_p){
ind_p = mat_p
ind_p[mat_p > max_p] = 0
ind_p[mat_p <= max_p] = 1
mat_filter = mat * ind_p
return(mat_filter)
}
hard_thresh = function(R, th){
R_th = R
R_th[abs(R) <= th] = 0
return(R_th)
}
1. Data generation
1.1 Absolute abundances
set.seed(123)
n = 50
d = 100
mu = c(rep(NA, 4), # Taxa with correlations
10000, 10000, # High abundant taxa
sample(c(200, 50), d - 6, replace = TRUE, prob = c(0.8, 0.2))) # Low abundant taxa
# NB distribution
dispersion = 0.5
# Absolute abundances
A = matrix(NA, ncol = n, nrow = d)
for (i in seq.int(from = 5, to = d)) {
A[i, ] = rnbinom(n = n, size = 1/dispersion, mu = mu[i])
}
# Dependent pairs of taxa
template1 = rnbinom(n = n, size = 1/dispersion, mu = 2000)
template2 = rnbinom(n = n, size = 1/dispersion, mu = 2000)
template1_poly = poly(x = template1, degree = 1, raw = FALSE)
template2_poly = poly(x = template2, degree = 2, raw = FALSE)
poly1 = round(10000 * template1_poly[, 1] + 10000)
poly2 = round(10000 * template2_poly[, 2] + 10000)
A_dep = rbind(template1, template2, poly1, poly2)
for (i in 1:4) {
A[i, ] = A_dep[i, ]
}
mu[1:2] = 2000
mu[3:4] = 10000
taxa_id = paste0("T", seq_len(d))
sample_id = paste0("S", seq_len(n))
rownames(A) = taxa_id
colnames(A) = sample_id
p_A = data.frame(t(A[1:5, ])) %>%
ggpairs(title = "Synthetic Absolute Abundances",
progress = FALSE) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
strip.background = element_rect(fill = "white"))
print(p_A)

1.2 Observed abundances
# Sequencing efficiency
C = exp(rnorm(d, mean = 0, sd = 1))
# Microbial loads in the ecosystem
A_prim = A * C
A_dot = colSums(A_prim)
# Relative abundances in the ecosystem
R = A_prim/t(replicate(d, A_dot))
# Sampling fractions
S = rbeta(n = n, shape1 = 2, shape2 = 10)
# Library sizes
O_dot = round(S * A_dot)
# Observed abundances
O = matrix(NA, nrow = d, ncol = n)
for (i in seq(n)) {
O[, i] = rmultinom(1, size = O_dot[i], prob = R[, i])
}
rownames(O) = taxa_id
colnames(O) = sample_id
# Random errors
E = O/(A * C * t(replicate(d, S)))
# Log scale parameters
a = log(A)
c = log(C)
s = log(S)
e = log(E)
# Log scale variables
o = log(O)
r = O/t(replicate(d, O_dot))
p_O = data.frame(t(O[1:5, ])) %>%
ggpairs(title = "Synthetic Observed Abundances",
progress = FALSE) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
strip.background = element_rect(fill = "white"))
print(p_O)

1.3 Example: taxon 3 vs. taxon 4
# Absolute abundance
A_t34 = data.frame(t(A[3:4, ]))
ls_cor_test = cor.test(x = A_t34$T3, y = A_t34$T4, method = "pearson")
df_ann = data.frame(x = 12000, y = 14000) %>%
mutate(label = paste0("r = ", round(ls_cor_test$estimate, 2),
"\n", "p = ", round(ls_cor_test$p.value, 2)))
p_A_t34 = A_t34 %>%
ggplot(aes(x = T3, y = T4)) +
geom_point(size = 8, alpha = 0.6) +
labs(title = "Absolute Abundances for Taxon 3 and Taxon 4") +
geom_label(data = df_ann, aes(x = x, y = y, label = label),
size = 6, vjust = -0.5, hjust = 0, color = "black") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
p_A_t34

# Observed abundance
O_t34 = data.frame(t(O[3:4, ]))
ls_cor_test = cor.test(x = O_t34$T3, y = O_t34$T4, method = "pearson")
df_ann = data.frame(x = 400, y = 100) %>%
mutate(label = paste0("r = ", round(ls_cor_test$estimate, 2),
"\n", "p = ", round(ls_cor_test$p.value, 2)))
p_O_t34 = O_t34 %>%
ggplot(aes(x = T3, y = T4)) +
geom_point(size = 8, alpha = 0.6) +
labs(title = "Observed Abundances for T3 and T4") +
geom_label(data = df_ann, aes(x = x, y = y, label = label),
size = 6, vjust = -0.5, hjust = 0, color = "black") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
p_O_t34

2. Standard Pearson correlation
2.1 Correlation matrix
o[is.infinite(o)] = NA
res_sample = Hmisc::rcorr(t(o), type = "pearson")
corr_sample = res_sample$r
corr_sample_p = res_sample$P
diag(corr_sample_p) = 0
corr_sample_fl = p_filter(corr_sample, corr_sample_p, max_p = 0.005)
df = corr_sample_fl[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_sample = df_p %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Standard Pearson Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.3, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_sample

2.2 P-value
p_dense_sample = p_dens(corr_sample_p)
p_dense_sample = p_dense_sample +
labs(title = "Standard Pearson Correlation") +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p_dense_sample)
3. SparCC
corr_sparcc = sparcc(t(O))$Cor
colnames(corr_sparcc) = rownames(O)
rownames(corr_sparcc) = rownames(O)
corr_sparcc_th = hard_thresh(corr_sparcc, th = 0.3)
df = corr_sparcc_th[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_sparcc = df_p %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "SparCC") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.3, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_sparcc

4. SECOM
OTU = otu_table(O, taxa_are_rows = TRUE)
meta_data = data.frame(sample_id = sample_id)
META = sample_data(meta_data)
sample_names(META) = meta_data$sample_id
otu_data = phyloseq(OTU, META)
pseqs = list(c(otu_data, otu_data))
pesudo = 0; zero_cut = 0.5; corr_cut = 0.5; lib_cut = 1000
wins_quant = c(0.05, 0.95); method = "pearson"; soft = FALSE; thresh_len = 20
n_cv = 10; seed = 123; thresh_hard = 0.3; max_p = 0.005; n_cl = 5
res_linear = secom_linear(pseqs, pesudo, zero_cut, corr_cut, lib_cut,
wins_quant, method, soft, thresh_len, n_cv,
seed, thresh_hard, max_p, n_cl)
max_p = 0.01; R = 1000
res_dist = secom_dist(pseqs, pesudo, zero_cut, corr_cut, lib_cut,
wins_quant, R, seed, max_p, n_cl)
4.1 Absolute abundance estimation
y_hat = res_linear$y_hat
p_y_hat = data.frame(t(y_hat[1:5, ])) %>%
ggpairs(title = "Estimated Absolute Abundances",
progress = FALSE) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
strip.background = element_rect(fill = "white"))
print(p_y_hat)

4.2 Linear correlation
4.21 Thresholding
df = res_linear$corr_th[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_secom1 = df_p %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "SECOM") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.3, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_secom1

4.22 Filtering
df = res_linear$corr_fl[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_secom2 = df_p %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "SECOM") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.3, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_secom2

4.23 P-value
p_dense_secom1 = p_dens(res_linear$corr_p)
p_dense_secom1 = p_dense_secom1 +
labs(title = "SECOM") +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p_dense_secom1)
4.3 Distance correlation
4.31 Filtering
df = res_dist$dcorr_fl[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))
p_secom3 = df_p %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "SECOM") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = c(0.3, 0.8),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)) +
coord_fixed()
p_secom3

4.32 P-value
p_dense_secom2 = p_dens(res_dist$dcorr_p)
p_dense_secom2 = p_dense_secom2 +
labs(title = "SECOM") +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p_dense_secom2)
5. Outputs
p_main1 = ggarrange(ggmatrix_gtable(p_A), ggmatrix_gtable(p_O),
ncol = 2, nrow = 1, labels = c("a", "b"))
p_main2 = ggarrange(p_sample, p_sparcc, p_secom3, ncol = 3,
labels = c("c", "d", "e"), common.legend = TRUE,
legend = "bottom")
p_main = ggarrange(p_main1, p_main2, ncol = 1, nrow = 2)
ggsave(plot = p_main, "../images/main/illustrate.pdf", height = 12, width = 12)
ggsave(plot = p_main, "../images/main/illustrate.jpeg", height = 12, width = 12, dpi = 300)
p_supp = ggarrange(p_dense_sample, p_dense_secom2,
ncol = 2, nrow = 1, labels = c("a", "b"),
common.legend = TRUE, legend = "bottom")
ggsave(plot = p_supp, "../images/supp/supp_p_value.pdf", height = 5, width = 10)
ggsave(plot = p_supp, "../images/supp/supp_p_value.jpeg", height = 5, width = 10, dpi = 300)